##Preparation for analysis
The necessary packages were loaded.
suppressMessages(library("tidyverse"))
suppressMessages(library("plotly"))
suppressMessages(library("ggpubr"))
suppressMessages(library("DT"))
The gapminder data provided by BIG Bioinformatics was imported as a data frame object called gapminder. The columns were renamed to make them easier to work with.
# Import the gapminder CSV and, in the same pipeline, replace the long
# descriptive column headers with short syntactic names.
gapminder <- read_csv("gapminder_clean.csv") %>%
  rename(
    country         = `Country Name`,
    life_expectancy = `Life expectancy at birth, total (years)`,
    co2_emissions   = `CO2 emissions (metric tons per capita)`,
    energy_use      = `Energy use (kg of oil equivalent per capita)`,
    exportsof_goods = `Exports of goods and services (% of GDP)`,
    pop_dens        = `Population density (people per sq. km of land area)`
  )
The data were filtered for the year 1962 and gdpPercap was plotted against CO2 emissions. Notice that a large proportion of the points is concentrated on the left side of the plot. This means that the data need to be transformed so that they are spread out and can be analyzed correctly. Hence both variables were log-transformed and a second plot was made for the transformed data.
# Keep only the 1962 observations and the two variables being compared.
# Year is matched against a numeric literal, like the other Year filters in
# this analysis, instead of relying on implicit number-to-string coercion.
gapminder_1962 <- gapminder %>%
  filter(Year == 1962) %>%
  select(co2_emissions, gdpPercap)

# Scatter plot of the untransformed values with a linear fit.
# na.rm = TRUE silences the warnings ggplot2 emits for rows it has to drop.
plot1 <- ggplot(gapminder_1962, aes(x = co2_emissions, y = gdpPercap)) +
  ggtitle("Non-transformed data.") +
  geom_point(na.rm = TRUE) +
  xlim(0, 20) +
  ylim(0, 25000) +
  geom_smooth(method = "lm", na.rm = TRUE, se = FALSE)
ggplotly(plot1)
# Rebuild the 1962 subset and add natural-log versions of both variables.
# Year is matched against a numeric literal, like the other Year filters in
# this analysis, instead of relying on implicit number-to-string coercion.
gapminder_1962 <- gapminder %>%
  filter(Year == 1962) %>%
  select(co2_emissions, gdpPercap) %>%
  mutate(
    co2_emission_transformed = log(co2_emissions),
    gdpPercap_transformed = log(gdpPercap)
  )

# Scatter plot of the log-transformed values with a linear fit.
plot62 <- ggplot(
  gapminder_1962,
  aes(x = co2_emission_transformed, y = gdpPercap_transformed)
) +
  ggtitle("Transformed data.") +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm", na.rm = TRUE, se = FALSE)
ggplotly(plot62)
A Pearson correlation test was applied to the transformed data. The R-value and the associated p-value were extracted from the result of the correlation test.
# Interactive table of the 1962 subset (raw and log-transformed columns).
datatable(
  gapminder_1962,
  caption = "Table 1: Gapminder data from the year 1962",
  options = list(scrollX = TRUE, scrollY = TRUE)
)

# Pearson correlation between log GDP per capita and log CO2 emissions;
# the coefficient and its p-value are kept for the report line below.
tested <- cor.test(
  x = gapminder_1962$gdpPercap_transformed,
  y = gapminder_1962$co2_emission_transformed,
  method = "pearson"
)
rvalue2 <- tested[["estimate"]]
plvalue2 <- tested[["p.value"]]

cat(
  "From the transformed data the R value is :", rvalue2,
  "and P value is :", plvalue2
)
## From the transformed data the R value is : 0.8602081 and P value is : 8.903567e-33
To find the year with the maximum correlation between CO2 emissions and GDP per capita, the following steps were taken:
The gapminder data was grouped by year. The gdpPercap and CO2 emissions columns were log-transformed and the transformed columns were saved as new columns. The correlation value of each year was calculated and added as a column. The data was sorted in descending order of the correlation values.
# One row per Year holding the Pearson correlation between log GDP per capita
# and log CO2 emissions, strongest correlation first.
# summarise() collapses each year to a single row and .groups = "drop" returns
# an ungrouped tibble, so later steps are not affected by leftover grouping.
gapminder_t <- gapminder %>%
  group_by(Year) %>%
  mutate(
    co2_emission_transformed = log(co2_emissions),
    gdpPercap_transformed = log(gdpPercap)
  ) %>%
  select(co2_emission_transformed, gdpPercap_transformed, Year) %>%
  na.omit() %>%
  summarise(
    correlation = cor(
      gdpPercap_transformed,
      co2_emission_transformed,
      method = "pearson"
    ),
    .groups = "drop"
  ) %>%
  arrange(desc(correlation))

# The first row is the year with the strongest correlation; read the value
# straight from the column instead of coercing a 1x1 tibble.
max_year <- as.double(gapminder_t$Year[1])

# Show the per-year correlations, most recent year first.
gapminder_t %>%
  arrange(desc(Year)) %>%
  datatable(
    options = list(scrollX = TRUE, scrollY = TRUE),
    caption = "Table 2: Correlation values across the years"
  )
cat("In", max_year, "correlation between Co2 emissions and gdp per capita is strongest.")
## In 2002 correlation between Co2 emissions and gdp per capita is strongest.
# Point-and-line chart of the yearly correlation values held in gapminder_t.
year <- ggplot(data = gapminder_t, mapping = aes(x = Year, y = correlation)) +
  geom_point() +
  geom_line() +
  labs(title = "correlation values across the years")
ggplotly(year)
After estimating 2002 as the year with the maximum correlation between CO2 emissions and GDP per capita, the data from that year was filtered from the raw gapminder data. A scatter plot, saved as gapminder_max, was drawn to visualize the correlation between CO2 emissions and GDP per capita.
# Scatter plot of log CO2 emissions against log GDP per capita for max_year,
# the year with the strongest correlation. Points are coloured by continent
# and sized by population.
# Note: gapminder_max holds the ggplot object, not the filtered data frame.
gapminder_max <- gapminder %>%
  filter(Year == max_year) %>%
  mutate(
    co2_transformed = log(co2_emissions),
    gdpPercap_transformed = log(gdpPercap)
  ) %>%
  ggplot(aes(
    x = co2_transformed,
    y = gdpPercap_transformed,
    color = continent,
    size = pop
  )) +
  # Build the title from max_year so it always matches the filtered data.
  ggtitle(paste("Transformed data from the year", max_year)) +
  geom_point(na.rm = TRUE)
ggplotly(gapminder_max)
The energy use of each continent was plotted. A difference between the continents in energy use was noticed. To test the difference between the continents, a one-way ANOVA test was conducted. The result of the ANOVA test was summarized. In the Tukey post-hoc comparison, the adjusted p-values of Asia-Americas and Oceania-Europe were higher than 0.05, so these two pairs were not significantly different.
# Complete cases of the two variables used by the ANOVA.
gapminder_anova <- gapminder %>%
  select(continent, energy_use) %>%
  na.omit()

# Box plot of energy use per continent on a log10 axis. It is drawn from
# gapminder_anova so the plot shows exactly the observations the ANOVA tests;
# applying na.omit() to the full table would also drop rows that are only
# missing values in unrelated columns.
anovaplot <- gapminder_anova %>%
  ggplot(aes(x = continent, y = energy_use, fill = continent)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(title = "Continents versus Energy Use") +
  ylab("Energy Use (kg of oil equivalent per capita)") +
  xlab("Continent")
ggplotly(anovaplot)

# One-way ANOVA of energy use by continent.
anova <- aov(energy_use ~ continent, data = gapminder_anova)
print(summary(anova))

# Tukey HSD pairwise comparisons, reshaped into a data frame with the
# comparison label as a regular column for plotting.
tukey <- TukeyHSD(anova)
tukey <- tukey$continent %>%
  data.frame() %>%
  rownames_to_column(var = "Comparisons")

# Mean difference with its confidence interval for every continent pair;
# the dashed red line marks a difference of zero. The `text` aesthetic is
# carried along for the interactive plotly tooltip.
tukeyp <- ggplot(tukey, aes(Comparisons, diff)) +
  geom_point(aes(text = paste("Mean difference:", signif(diff, 5)))) +
  geom_errorbar(
    aes(ymin = lwr, ymax = upr),
    width = .2,
    position = position_dodge(0.05)
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", size = 1) +
  ylab("Differences in mean") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)) +
  labs(title = "Continents versus Energy Use, Tukey results")
ggplotly(tukeyp)
cat("Except for Asia-Americas and Oceania-Europe, the results are significant.")
## Except for Asia-Americas and Oceania-Europe, the results are significant.
Question: Is there a significant difference between Europe and Asia with respect to Imports of goods and services (% of GDP) in the years after 1990?
Gapminder raw data was filtered for the years after 1990 by selecting the Year, Imports of goods and services, and continent columns for the Europe or Asia continent. Imports of goods and services in Europe and Asia from 1992-2007 was plotted to visualize the data. A two-way ANOVA test was conducted for Europe and Asia and the summary of the results was printed. No significant difference was found between Europe and Asia with respect to Imports of goods and services.
# NOTE(review): the question, the text above and the y-axis label all refer to
# *Imports* of goods and services, but the column analysed here,
# exportsof_goods, was renamed from "Exports of goods and services (% of GDP)".
# If the CSV provides an imports column, it should be used here instead --
# TODO confirm against the data file.

# Observations strictly after 1990 for the two continents being compared.
gapminder_after90 <- gapminder %>%
  filter(Year > 1990, continent %in% c("Europe", "Asia")) %>%
  select(Year, exportsof_goods, continent) %>%
  na.omit()

# Grouped box plot per year, one box per continent.
after90 <- ggboxplot(
  gapminder_after90,
  x = "Year",
  y = "exportsof_goods",
  fill = "continent",
  palette = c("#FC4E07", "#E7B800")
) +
  ylab("Imports of Goods and Services (% of GDP)") +
  xlab("Years") +
  labs(title = "Goods & services in Europe and Asia")

# ANOVA with Year and continent as additive terms.
two_anova <- aov(exportsof_goods ~ Year + continent, data = gapminder_after90)

ggplotly(after90) %>%
  layout(width = 650, boxmode = "group")
print(summary(two_anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 1 4439 4439 5.667 0.0182 *
## continent 1 1345 1345 1.717 0.1915
## Residuals 209 163718 783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Report the conclusion for the continent term only: in the printed ANOVA
# table the Year term is below 0.05, so "P values are too big" was inaccurate.
cat("The p-value for continent is above 0.05, hence no significant difference between Europe and Asia was found.")
## The p-value for continent is above 0.05, hence no significant difference between Europe and Asia was found.
Question : What is the country that has the highest population density across all years?
The gapminder raw data was filtered again to create a pop_dens object by selecting the country name and population density columns. The average population density was calculated for each country and the results were added as a new column called average. The data was arranged in descending order of average population density. Thus, it has been ensured that the maximum population density is at the top of the data frame. The country with the maximum population density and its population density value were extracted from the pop_dens object and printed. The 20 highest average population density values were also extracted and plotted.
# Mean population density per country over all available years,
# densest country first.
pop_dens <- gapminder %>%
  select(country, pop_dens) %>%
  drop_na() %>%
  group_by(country) %>%
  summarise(average = mean(pop_dens)) %>%
  arrange(desc(average))

# The top row holds the densest country and its average density.
max_pop_dens_c <- as.character(pop_dens$country[1])
max_pop_dens <- as.double(pop_dens$average[1])

# Twenty densest countries, with country turned into a factor ordered by
# average so the plot axis follows the ranking.
topdens <- transform(
  head(pop_dens, 20),
  country = reorder(country, average)
)

ggplotly(
  ggplot(topdens, aes(x = average, y = country, color = average)) +
    geom_point() +
    ggtitle("Average population density across countries") +
    ylab("Countries") +
    xlab("Average Population Density"),
  width = 600,
  height = 400
)
cat(max_pop_dens_c, "has the highest average number of population density among the all countries with the population density of", max_pop_dens)
## Macao SAR, China has the highest average number of population density among the all countries with the population density of 14732.04
Question: Which country has shown the greatest increase in life expectancy since 1962?
Life expectancy values of the countries from 1962 to 2007 were saved into the gapminder_tot object. The differences between the 2007 and 1962 life expectancy values for each country were calculated and the results were saved as a new column called Expectancy. The data was sorted by Expectancy in descending order. Thus, it has been ensured that the maximum Expectancy is at the top of the data frame. The country with the maximum Expectancy and its Expectancy value were extracted from the gapminder_tot object and printed. The 20 highest Expectancy values were also extracted and plotted.
# Per country: life expectancy in 2007 and in 1962 and the gain between them
# (Expectancy), largest gain first. The unused Country.Name factor column that
# used to be added here has been removed.
gapminder_tot <- gapminder %>%
  filter(!is.na(life_expectancy)) %>%
  group_by(country) %>%
  summarize(
    gapm_2007 = life_expectancy[which(Year == 2007)],
    gapm_1962 = life_expectancy[which(Year == 1962)],
    Expectancy = gapm_2007 - gapm_1962
  ) %>%
  arrange(desc(Expectancy))

gapminder_tot %>%
  select(country, gapm_2007, gapm_1962, Expectancy) %>%
  datatable(
    options = list(scrollX = TRUE, scrollY = TRUE),
    caption = "Table 3: Life Expectancy tables for countries"
  )

# Only the country and its gain are needed from here on.
gapminder_tot <- gapminder_tot %>%
  select(country, Expectancy)

# The top row holds the country with the largest gain.
max_count <- as.character(gapminder_tot$country[1])
max_life <- as.double(gapminder_tot$Expectancy[1])

# Twenty largest gains, with country ordered by Expectancy for the plot axis.
top20 <- gapminder_tot %>%
  head(20) %>%
  transform(country = reorder(country, Expectancy))

# Title, axis label and message describe the plotted quantity, which is the
# 1962-2007 increase in life expectancy rather than life expectancy itself.
ggplotly(
  ggplot(top20, aes(x = Expectancy, y = country, color = Expectancy)) +
    geom_point() +
    ggtitle("Increase in life expectancy, 1962-2007") +
    ylab("Countries") +
    xlab("Increase in life expectancy (years)"),
  width = 600,
  height = 400
)
cat(max_count, "has shown the greatest increase in life expectancy since 1962, with a gain of", max_life, "years")
## Maldives has shown the greatest increase in life expectancy since 1962, with a gain of 36.91615 years